{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 40,
   "metadata": {},
   "outputs": [],
   "source": [
    "import csv\n",
    "import random\n",
    "import jsonlines\n",
    "import time\n",
    "import pandas as pd\n",
    "from datetime import datetime"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 41,
   "metadata": {},
   "outputs": [],
   "source": [
    "def compute_duration(start_time: str, end_time: str, time_format: str = \"%H:%M\") -> str:\n",
    "    start_time_obj = datetime.strptime(start_time, time_format)\n",
    "    end_time_obj = datetime.strptime(end_time, time_format)\n",
    "\n",
    "    duration = str(end_time_obj - start_time_obj)\n",
    "    hours, minutes, seconds = duration.split(\":\")\n",
    "    return hours, minutes, seconds"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 42,
   "metadata": {},
   "outputs": [],
   "source": [
    "def read_csv(file_path):\n",
    "    with open(file_path, \"r\") as f:\n",
    "        reader = csv.reader(f)\n",
    "        data = list(reader)\n",
    "    return data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "def check_case(content, header_dict):\n",
    "    a = {}\n",
    "    for header in header_dict.keys():\n",
    "        a[header] = content[header_dict[header]]\n",
    "    print(a)\n",
    "    input()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 44,
   "metadata": {},
   "outputs": [],
   "source": [
    "def dataset_stats(data):\n",
    "    date = {}\n",
    "    for row in data:\n",
    "        if not row[0] in date:\n",
    "            date[row[0]] = 1\n",
    "        else:\n",
    "            date[row[0]] += 1\n",
    "    return date"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 45,
   "metadata": {},
   "outputs": [],
   "source": [
    "def wrap_data(header, data):\n",
    "    pre_processed_data = []\n",
    "    row_id = 0\n",
    "    for row in data:\n",
    "        content = row\n",
    "        pre_processed_data.append({\"id\":\"flights-{:0>8d}\".format(row_id), \"header\": header, \"content\":content})\n",
    "        row_id += 1\n",
    "    return pre_processed_data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Index(['FlightDate', 'Airline', 'Origin', 'Dest', 'Cancelled', 'Diverted',\n",
      "       'CRSDepTime', 'DepTime', 'DepDelayMinutes', 'DepDelay', 'ArrTime',\n",
      "       'ArrDelayMinutes', 'AirTime', 'CRSElapsedTime', 'ActualElapsedTime',\n",
      "       'Distance', 'Year', 'Quarter', 'Month', 'DayofMonth', 'DayOfWeek',\n",
      "       'Marketing_Airline_Network', 'Operated_or_Branded_Code_Share_Partners',\n",
      "       'DOT_ID_Marketing_Airline', 'IATA_Code_Marketing_Airline',\n",
      "       'Flight_Number_Marketing_Airline', 'Operating_Airline',\n",
      "       'DOT_ID_Operating_Airline', 'IATA_Code_Operating_Airline',\n",
      "       'Tail_Number', 'Flight_Number_Operating_Airline', 'OriginAirportID',\n",
      "       'OriginAirportSeqID', 'OriginCityMarketID', 'OriginCityName',\n",
      "       'OriginState', 'OriginStateFips', 'OriginStateName', 'OriginWac',\n",
      "       'DestAirportID', 'DestAirportSeqID', 'DestCityMarketID', 'DestCityName',\n",
      "       'DestState', 'DestStateFips', 'DestStateName', 'DestWac', 'DepDel15',\n",
      "       'DepartureDelayGroups', 'DepTimeBlk', 'TaxiOut', 'WheelsOff',\n",
      "       'WheelsOn', 'TaxiIn', 'CRSArrTime', 'ArrDelay', 'ArrDel15',\n",
      "       'ArrivalDelayGroups', 'ArrTimeBlk', 'DistanceGroup',\n",
      "       'DivAirportLandings'],\n",
      "      dtype='object')\n"
     ]
    }
   ],
   "source": [
    "start_time = time.time()\n",
    "file_path = \"/<YOUR_OWN_PATH>/ToolQA/data/raw_data/flights/Combined_Flights_2022.csv\"\n",
    "data = read_csv(file_path)\n",
    "df = pd.read_csv(file_path)\n",
    "print(df.columns)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [],
   "source": [
    "num_questions_per_template = 10\n",
    "question_id = 0\n",
    "questions = []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[{'qid': 'easy-flight-0000', 'question': 'What was the departure time of the AA2319 flight from MIA to LAS on 2022-06-05?', 'answer': '21:43'}, {'qid': 'easy-flight-0001', 'question': 'What was the departure time of the AA4869 flight from HHH to CLT on 2022-05-06?', 'answer': '12:16'}, {'qid': 'easy-flight-0002', 'question': 'What was the departure time of the DL5172 flight from SBN to ATL on 2022-01-03?', 'answer': '12:33'}, {'qid': 'easy-flight-0003', 'question': 'What was the departure time of the DL891 flight from SEA to LAX on 2022-01-22?', 'answer': '11:54'}, {'qid': 'easy-flight-0004', 'question': 'What was the departure time of the DL414 flight from LGA to ATL on 2022-02-07?', 'answer': '18:45'}, {'qid': 'easy-flight-0005', 'question': 'What was the departure time of the DL2759 flight from MCO to DTW on 2022-05-14?', 'answer': '16:49'}, {'qid': 'easy-flight-0006', 'question': 'What was the departure time of the DL82 flight from MEM to ATL on 2022-01-18?', 'answer': '12:53'}, {'qid': 'easy-flight-0007', 'question': 'What was the departure time of the WN2392 flight from PDX to OAK on 2022-04-12?', 'answer': '13:18'}, {'qid': 'easy-flight-0008', 'question': 'What was the departure time of the F9124 flight from JAX to SJU on 2022-02-28?', 'answer': '11:25'}, {'qid': 'easy-flight-0009', 'question': 'What was the departure time of the AA2792 flight from SAT to DFW on 2022-01-07?', 'answer': '7:51'}]\n"
     ]
    }
   ],
   "source": [
    "for i in range(num_questions_per_template):\n",
    "    answer = \"0\"\n",
    "    while answer == \"0\":\n",
    "        random_indice = random.randint(0, len(df)-1)\n",
    "        carrier = df.iloc[random_indice][\"IATA_Code_Marketing_Airline\"]\n",
    "        number = df.iloc[random_indice][\"Flight_Number_Marketing_Airline\"]\n",
    "        Origin = df.iloc[random_indice][\"Origin\"]\n",
    "        Dest = df.iloc[random_indice][\"Dest\"]\n",
    "        FlightDate = df.iloc[random_indice][\"FlightDate\"]\n",
    "        question = \"What was the departure time of the {}{} flight from {} to {} on {}?\".format(carrier, number, Origin, Dest, FlightDate)\n",
    "        if df.iloc[random_indice][\"DepTime\"] == '':\n",
    "            DepTime = \"0\"\n",
    "            answer = DepTime\n",
    "        else:\n",
    "            DepTime = str(int(float(df.iloc[random_indice][\"DepTime\"])))\n",
    "            answer = DepTime[:-2] + \":\" + DepTime[-2:]\n",
    "    questions.append({\"qid\": \"easy-flight-{:0>4d}\".format(question_id), \"question\":question, \"answer\":answer})\n",
    "    question_id += 1\n",
    "print(questions[-10:])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[{'qid': 'easy-flight-0010', 'question': 'Was the flight DL5097 from MLU to ATL cancelled on 2022-02-05?', 'answer': 'No'}, {'qid': 'easy-flight-0011', 'question': 'Was the flight AS715 from EWR to SEA cancelled on 2022-03-11?', 'answer': 'No'}, {'qid': 'easy-flight-0012', 'question': 'Was the flight UA2106 from LAS to IAH cancelled on 2022-02-05?', 'answer': 'No'}, {'qid': 'easy-flight-0013', 'question': 'Was the flight AA6152 from CHO to CLT cancelled on 2022-07-06?', 'answer': 'No'}, {'qid': 'easy-flight-0014', 'question': 'Was the flight UA3981 from TVC to ORD cancelled on 2022-03-18?', 'answer': 'No'}, {'qid': 'easy-flight-0015', 'question': 'Was the flight UA348 from OGG to ORD cancelled on 2022-06-29?', 'answer': 'No'}, {'qid': 'easy-flight-0016', 'question': 'Was the flight WN5333 from RDU to BWI cancelled on 2022-01-08?', 'answer': 'No'}, {'qid': 'easy-flight-0017', 'question': 'Was the flight WN2465 from SAN to DEN cancelled on 2022-01-10?', 'answer': 'No'}, {'qid': 'easy-flight-0018', 'question': 'Was the flight AA5566 from CLT to LEX cancelled on 2022-01-20?', 'answer': 'No'}, {'qid': 'easy-flight-0019', 'question': 'Was the flight AA2289 from MIA to LAX cancelled on 2022-02-20?', 'answer': 'No'}]\n"
     ]
    }
   ],
   "source": [
    "for i in range(num_questions_per_template):\n",
    "    random_indice = random.randint(0, len(df)-1)\n",
    "    carrier = df.iloc[random_indice][\"IATA_Code_Marketing_Airline\"]\n",
    "    number = df.iloc[random_indice][\"Flight_Number_Marketing_Airline\"]\n",
    "    Origin = df.iloc[random_indice][\"Origin\"]\n",
    "    Dest = df.iloc[random_indice][\"Dest\"]\n",
    "    FlightDate = df.iloc[random_indice][\"FlightDate\"]\n",
    "    question = \"Was the flight {}{} from {} to {} cancelled on {}?\".format(carrier, number, Origin, Dest, FlightDate)\n",
    "    Cancelled = df.iloc[random_indice][\"Cancelled\"]\n",
    "    if Cancelled == False:\n",
    "        answer = \"No\"\n",
    "    else:\n",
    "        answer = \"Yes\"\n",
    "    questions.append({\"qid\": \"easy-flight-{:0>4d}\".format(question_id), \"question\":question, \"answer\":answer})\n",
    "    question_id += 1\n",
    "print(questions[-10:])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[{'qid': 'easy-flight-0020', 'question': 'What is the flight number of the SkyWest Airlines Inc. flight from IAH to PIB on 2022-06-24?', 'answer': '5107'}, {'qid': 'easy-flight-0021', 'question': 'What is the flight number of the Delta Air Lines Inc. flight from MCO to LGA on 2022-04-30?', 'answer': '2873, 1176, 1340, 2398'}, {'qid': 'easy-flight-0022', 'question': 'What is the flight number of the Mesa Airlines Inc. flight from IAH to RDU on 2022-01-12?', 'answer': '6081'}, {'qid': 'easy-flight-0023', 'question': 'What is the flight number of the Alaska Airlines Inc. flight from RDU to SEA on 2022-06-18?', 'answer': '905, 1309'}, {'qid': 'easy-flight-0024', 'question': 'What is the flight number of the Commutair Aka Champlain Enterprises, Inc. flight from CHO to IAD on 2022-01-07?', 'answer': '4258'}, {'qid': 'easy-flight-0025', 'question': 'What is the flight number of the Southwest Airlines Co. flight from MIA to BWI on 2022-02-11?', 'answer': '159, 836, 1185, 1647'}, {'qid': 'easy-flight-0026', 'question': 'What is the flight number of the Frontier Airlines Inc. flight from SJU to MCO on 2022-07-14?', 'answer': '111, 105, 107'}, {'qid': 'easy-flight-0027', 'question': 'What is the flight number of the Delta Air Lines Inc. flight from BOS to MCO on 2022-06-06?', 'answer': '1400, 1555, 2974'}, {'qid': 'easy-flight-0028', 'question': 'What is the flight number of the Delta Air Lines Inc. flight from ATL to MIA on 2022-01-27?', 'answer': '1372, 1381, 1538, 2251, 2279, 2301, 2352, 2860'}, {'qid': 'easy-flight-0029', 'question': 'What is the flight number of the Southwest Airlines Co. flight from ONT to LAS on 2022-07-08?', 'answer': '1452, 1683, 2007, 2078, 2663'}]\n"
     ]
    }
   ],
   "source": [
    "for i in range(num_questions_per_template):\n",
    "    answer = \"0\"\n",
    "    while answer == \"0\":\n",
    "        random_indice = random.randint(0, len(df)-1)\n",
    "        Airline = df.iloc[random_indice][\"Airline\"]\n",
    "        Origin = df.iloc[random_indice][\"Origin\"]\n",
    "        Dest = df.iloc[random_indice][\"Dest\"]\n",
    "        FlightDate = df.iloc[random_indice][\"FlightDate\"]\n",
    "        question = \"What is the flight number of the {} flight from {} to {} on {}?\".format(Airline, Origin, Dest, FlightDate)\n",
    "        sub_table = df[(df[\"Airline\"] == Airline) & (df[\"Origin\"] == Origin) & (df[\"Dest\"] == Dest) & (df[\"FlightDate\"] == FlightDate)]\n",
    "        if len(sub_table) == 0:\n",
    "            answer = \"0\"\n",
    "        else:\n",
    "            answer = \", \".join([str(i) for i in list(sub_table[\"Flight_Number_Marketing_Airline\"].unique())])\n",
    "    questions.append({\"qid\": \"easy-flight-{:0>4d}\".format(question_id), \"question\":question, \"answer\":answer})\n",
    "    question_id += 1\n",
    "print(questions[-10:])\n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[{'qid': 'easy-flight-0030', 'question': 'How long was the different between the CRS recorded departure time and actual departure time of the DL5273 flight from JFK to CVG on 2022-03-29?', 'answer': '5'}, {'qid': 'easy-flight-0031', 'question': 'How long was the different between the CRS recorded departure time and actual departure time of the WN573 flight from PHX to SMF on 2022-04-15?', 'answer': '30'}, {'qid': 'easy-flight-0032', 'question': 'How long was the different between the CRS recorded departure time and actual departure time of the DL3826 flight from LAX to ABQ on 2022-06-15?', 'answer': '6'}, {'qid': 'easy-flight-0033', 'question': 'How long was the different between the CRS recorded departure time and actual departure time of the B6792 flight from TPA to BOS on 2022-01-13?', 'answer': '44'}, {'qid': 'easy-flight-0034', 'question': 'How long was the different between the CRS recorded departure time and actual departure time of the DL5233 flight from MSP to CVG on 2022-07-21?', 'answer': '53'}, {'qid': 'easy-flight-0035', 'question': 'How long was the different between the CRS recorded departure time and actual departure time of the DL712 flight from ATL to IND on 2022-06-08?', 'answer': '2'}, {'qid': 'easy-flight-0036', 'question': 'How long was the different between the CRS recorded departure time and actual departure time of the AA2020 flight from TPA to CLT on 2022-02-07?', 'answer': '3'}, {'qid': 'easy-flight-0037', 'question': 'How long was the different between the CRS recorded departure time and actual departure time of the UA3430 flight from EWR to RDU on 2022-06-10?', 'answer': '11'}, {'qid': 'easy-flight-0038', 'question': 'How long was the different between the CRS recorded departure time and actual departure time of the AA1235 flight from DFW to LAX on 2022-07-31?', 'answer': '3'}, {'qid': 'easy-flight-0039', 'question': 'How long was the different between the CRS recorded departure time and actual departure time of the UA3831 flight from SBN to ORD on 2022-06-13?', 'answer': '16'}]\n"
     ]
    }
   ],
   "source": [
    "for i in range(num_questions_per_template):\n",
    "    answer = \"0\"\n",
    "    while answer == \"0\":\n",
    "        random_indice = random.randint(0, len(df)-1)\n",
    "        carrier = df.iloc[random_indice][\"IATA_Code_Marketing_Airline\"]\n",
    "        number = df.iloc[random_indice][\"Flight_Number_Marketing_Airline\"]\n",
    "        Origin = df.iloc[random_indice][\"Origin\"]\n",
    "        Dest = df.iloc[random_indice][\"Dest\"]\n",
    "        FlightDate = df.iloc[random_indice][\"FlightDate\"]\n",
    "        question = \"How long was the different between the CRS recorded departure time and actual departure time of the {}{} flight from {} to {} on {}?\".format(carrier, number, Origin, Dest, FlightDate)\n",
    "        if df.iloc[random_indice][\"DepTime\"] == '' or df.iloc[random_indice][\"CRSDepTime\"] == '' or pd.isna(df.iloc[random_indice][\"DepTime\"]) or pd.isna(df.iloc[random_indice][\"CRSDepTime\"]):\n",
    "            DepTime = \"0\"\n",
    "            answer = DepTime\n",
    "        else:\n",
    "            DiffTime = abs(float(df.iloc[random_indice][\"DepTime\"]) - float(df.iloc[random_indice][\"CRSDepTime\"]))\n",
    "            DepTime = str(int(DiffTime))\n",
    "            answer = DepTime\n",
    "            # answer = DepTime[:-2] + \":\" + DepTime[-2:]\n",
    "    questions.append({\"qid\": \"easy-flight-{:0>4d}\".format(question_id), \"question\":question, \"answer\":answer})\n",
    "    question_id += 1\n",
    "print(questions[-10:])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[{'qid': 'easy-flight-0040', 'question': 'How long did UA3449 delay when arrival on 2022-03-24?', 'answer': '15.0'}, {'qid': 'easy-flight-0041', 'question': 'How long did UA5482 delay when arrival on 2022-06-06?', 'answer': '124.0'}, {'qid': 'easy-flight-0042', 'question': 'How long did AS218 delay when arrival on 2022-03-24?', 'answer': '26.0'}, {'qid': 'easy-flight-0043', 'question': 'How long did AA4631 delay when arrival on 2022-06-27?', 'answer': '19.0'}, {'qid': 'easy-flight-0044', 'question': 'How long did WN4822 delay when arrival on 2022-07-30?', 'answer': '2.0'}, {'qid': 'easy-flight-0045', 'question': 'How long did G42595 delay when arrival on 2022-04-03?', 'answer': '62.0'}, {'qid': 'easy-flight-0046', 'question': 'How long did WN2519 delay when arrival on 2022-02-21?', 'answer': '3.0'}, {'qid': 'easy-flight-0047', 'question': 'How long did DL3541 delay when arrival on 2022-04-24?', 'answer': '399.0'}, {'qid': 'easy-flight-0048', 'question': 'How long did NK3055 delay when arrival on 2022-02-17?', 'answer': 'nan'}, {'qid': 'easy-flight-0049', 'question': 'How long did AS3388 delay when arrival on 2022-02-12?', 'answer': '28.0'}]\n"
     ]
    }
   ],
   "source": [
    "for i in range(num_questions_per_template):\n",
    "    answer = \"0\"\n",
    "    while answer == \"0\":\n",
    "        random_indice = random.randint(0, len(df)-1)\n",
    "        Origin = df.iloc[random_indice][\"Origin\"]\n",
    "        Dest = df.iloc[random_indice][\"Dest\"]\n",
    "        FlightDate = df.iloc[random_indice][\"FlightDate\"]\n",
    "        carrier = df.iloc[random_indice][\"IATA_Code_Marketing_Airline\"]\n",
    "        number = df.iloc[random_indice][\"Flight_Number_Marketing_Airline\"]\n",
    "        question = \"How long did {}{} delay when arrival on {}?\".format(carrier, number, FlightDate)\n",
    "        if df.iloc[random_indice][\"ArrDelayMinutes\"] == \"\":\n",
    "            ArrDelay = \"0\"\n",
    "            answer = ArrDelay\n",
    "        else:\n",
    "            ArrDelay = str(float(df.iloc[random_indice][\"ArrDelayMinutes\"]))\n",
    "            answer = str(float(ArrDelay))\n",
    "        if answer == '0.0':\n",
    "            answer = '0'\n",
    "    questions.append({\"qid\": \"easy-flight-{:0>4d}\".format(question_id), \"question\":question, \"answer\":answer})\n",
    "    question_id += 1\n",
    "print(questions[-10:])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[{'qid': 'easy-flight-0050', 'question': 'How many extra minutes did the WN2522 flight take from ECP to ATL on 2022-03-27?', 'answer': '-10.0'}, {'qid': 'easy-flight-0051', 'question': 'How many extra minutes did the DL2058 flight take from MCO to RDU on 2022-04-13?', 'answer': '-10.0'}, {'qid': 'easy-flight-0052', 'question': 'How many extra minutes did the WN1749 flight take from MKE to DCA on 2022-05-23?', 'answer': '-1.0'}, {'qid': 'easy-flight-0053', 'question': 'How many extra minutes did the B6257 flight take from BOS to BZN on 2022-03-26?', 'answer': '-13.0'}, {'qid': 'easy-flight-0054', 'question': 'How many extra minutes did the WN2361 flight take from AUS to FLL on 2022-06-16?', 'answer': '-19.0'}, {'qid': 'easy-flight-0055', 'question': 'How many extra minutes did the HA73 flight take from ONT to HNL on 2022-01-13?', 'answer': '-28.0'}, {'qid': 'easy-flight-0056', 'question': 'How many extra minutes did the WN1415 flight take from ATL to LGA on 2022-01-02?', 'answer': '-14.0'}, {'qid': 'easy-flight-0057', 'question': 'How many extra minutes did the AA1855 flight take from AUS to SJU on 2022-07-13?', 'answer': '13.0'}, {'qid': 'easy-flight-0058', 'question': 'How many extra minutes did the AA5027 flight take from DCA to PWM on 2022-04-12?', 'answer': '-9.0'}, {'qid': 'easy-flight-0059', 'question': 'How many extra minutes did the DL5698 flight take from DCA to JFK on 2022-07-01?', 'answer': '23.0'}]\n"
     ]
    }
   ],
   "source": [
    "for i in range(num_questions_per_template):\n",
    "    answer = \"0\"\n",
    "    while answer == \"0\":\n",
    "        random_indice = random.randint(0, len(df)-1)\n",
    "        carrier = df.iloc[random_indice][\"IATA_Code_Marketing_Airline\"]\n",
    "        number = df.iloc[random_indice][\"Flight_Number_Marketing_Airline\"]\n",
    "        Origin = df.iloc[random_indice][\"Origin\"]\n",
    "        Dest = df.iloc[random_indice][\"Dest\"]\n",
    "        FlightDate = df.iloc[random_indice][\"FlightDate\"]\n",
    "        question = \"How many extra minutes did the {}{} flight take from {} to {} on {}?\".format(carrier, number, Origin, Dest, FlightDate)\n",
    "        if df.iloc[random_indice][\"ArrDelay\"] == \"\":\n",
    "            ArrDelay = \"0\"\n",
    "            error_flag = True\n",
    "        else:\n",
    "            ArrDelay = str(float(df.iloc[random_indice][\"ArrDelay\"]))\n",
    "        if df.iloc[random_indice][\"DepDelay\"] == \"\":\n",
    "            DepDelay = \"0\"\n",
    "            error_flag = True\n",
    "        else:\n",
    "            DepDelay = str(float(df.iloc[random_indice][\"DepDelay\"]))\n",
    "        answer = str(float(ArrDelay) - float(DepDelay))\n",
    "        if answer == '0.0':\n",
    "            answer = '0'\n",
    "    questions.append({\"qid\": \"easy-flight-{:0>4d}\".format(question_id), \"question\":question, \"answer\":answer})\n",
    "    question_id += 1\n",
    "print(questions[-10:])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 64,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[{'qid': 'easy-flight-0060', 'question': 'What was the local arrival time of the NK143 flight from BQN to MCO on 2022-02-16?', 'answer': '5:52'}, {'qid': 'easy-flight-0061', 'question': 'What was the local arrival time of the WN6075 flight from DEN to ATL on 2022-01-07?', 'answer': '17:31'}, {'qid': 'easy-flight-0062', 'question': 'What was the local arrival time of the UA504 flight from IAD to LAX on 2022-01-28?', 'answer': '8:57'}, {'qid': 'easy-flight-0063', 'question': 'What was the local arrival time of the NK1314 flight from FLL to DEN on 2022-06-11?', 'answer': '22:53'}, {'qid': 'easy-flight-0064', 'question': 'What was the local arrival time of the B61168 flight from PBI to HPN on 2022-01-05?', 'answer': '23:44'}, {'qid': 'easy-flight-0065', 'question': 'What was the local arrival time of the UA5415 flight from IAH to COS on 2022-07-21?', 'answer': '11:10'}, {'qid': 'easy-flight-0066', 'question': 'What was the local arrival time of the WN1360 flight from BWI to LAS on 2022-01-23?', 'answer': '8:10'}, {'qid': 'easy-flight-0067', 'question': 'What was the local arrival time of the DL4872 flight from TYS to LGA on 2022-03-28?', 'answer': '16:43'}, {'qid': 'easy-flight-0068', 'question': 'What was the local arrival time of the AA2281 flight from DFW to SAT on 2022-03-08?', 'answer': '19:03'}, {'qid': 'easy-flight-0069', 'question': 'What was the local arrival time of the DL5165 flight from XNA to MSP on 2022-01-17?', 'answer': '19:00'}]\n"
     ]
    }
   ],
   "source": [
    "for i in range(num_questions_per_template):\n",
    "    answer = \"0\"\n",
    "    while answer == \"0\":\n",
    "        random_indice = random.randint(0, len(df)-1)\n",
    "        carrier = df.iloc[random_indice][\"IATA_Code_Marketing_Airline\"]\n",
    "        number = df.iloc[random_indice][\"Flight_Number_Marketing_Airline\"]\n",
    "        Origin = df.iloc[random_indice][\"Origin\"]\n",
    "        Dest = df.iloc[random_indice][\"Dest\"]\n",
    "        FlightDate = df.iloc[random_indice][\"FlightDate\"]\n",
    "        question = \"What was the local arrival time of the {}{} flight from {} to {} on {}?\".format(carrier, number, Origin, Dest, FlightDate)\n",
    "        if df.iloc[random_indice][\"ArrTime\"] == '':\n",
    "            DepTime = \"0\"\n",
    "            answer = DepTime\n",
    "        else:\n",
    "            DepTime = str(int(float(df.iloc[random_indice][\"ArrTime\"])))\n",
    "            answer = DepTime[:-2] + \":\" + DepTime[-2:]\n",
    "    questions.append({\"qid\": \"easy-flight-{:0>4d}\".format(question_id), \"question\":question, \"answer\":answer})\n",
    "    question_id += 1\n",
    "print(questions[-10:])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[{'qid': 'easy-flight-0070', 'question': 'What was the CRS-recorded arrival time of the AA2733 flight from DFW to LAX on 2022-01-30?', 'answer': '23:59'}, {'qid': 'easy-flight-0071', 'question': 'What was the CRS-recorded arrival time of the B61075 flight from PVD to MCO on 2022-02-11?', 'answer': '19:23'}, {'qid': 'easy-flight-0072', 'question': 'What was the CRS-recorded arrival time of the NK1697 flight from OAK to LAS on 2022-05-24?', 'answer': '22:29'}, {'qid': 'easy-flight-0073', 'question': 'What was the CRS-recorded arrival time of the WN1111 flight from SJC to LAS on 2022-05-02?', 'answer': '16:40'}, {'qid': 'easy-flight-0074', 'question': 'What was the CRS-recorded arrival time of the G42148 flight from SAV to CVG on 2022-07-08?', 'answer': '8:05'}, {'qid': 'easy-flight-0075', 'question': 'What was the CRS-recorded arrival time of the UA2240 flight from ORD to CHS on 2022-07-29?', 'answer': '22:07'}, {'qid': 'easy-flight-0076', 'question': 'What was the CRS-recorded arrival time of the AA6104 flight from SYR to BOS on 2022-06-20?', 'answer': '20:10'}, {'qid': 'easy-flight-0077', 'question': 'What was the CRS-recorded arrival time of the B61586 flight from MCO to BUF on 2022-05-23?', 'answer': '17:06'}, {'qid': 'easy-flight-0078', 'question': 'What was the CRS-recorded arrival time of the AA1683 flight from CLT to DTW on 2022-03-17?', 'answer': '9:02'}, {'qid': 'easy-flight-0079', 'question': 'What was the CRS-recorded arrival time of the UA4425 flight from EWR to DCA on 2022-06-29?', 'answer': '8:17'}]\n"
     ]
    }
   ],
   "source": [
    "for i in range(num_questions_per_template):\n",
    "    answer = \"0\"\n",
    "    while answer == \"0\":\n",
    "        random_indice = random.randint(0, len(df)-1)\n",
    "        carrier = df.iloc[random_indice][\"IATA_Code_Marketing_Airline\"]\n",
    "        number = df.iloc[random_indice][\"Flight_Number_Marketing_Airline\"]\n",
    "        Origin = df.iloc[random_indice][\"Origin\"]\n",
    "        Dest = df.iloc[random_indice][\"Dest\"]\n",
    "        FlightDate = df.iloc[random_indice][\"FlightDate\"]\n",
    "        question = \"What was the CRS-recorded arrival time of the {}{} flight from {} to {} on {}?\".format(carrier, number, Origin, Dest, FlightDate)\n",
    "        if df.iloc[random_indice][\"CRSArrTime\"] == '':\n",
    "            CRSArrTime = \"0\"\n",
    "            answer = DepTime\n",
    "        else:\n",
    "            CRSArrTime = str(int(float(df.iloc[random_indice][\"CRSArrTime\"])))\n",
    "            answer = CRSArrTime[:-2] + \":\" + CRSArrTime[-2:]\n",
    "    questions.append({\"qid\": \"easy-flight-{:0>4d}\".format(question_id), \"question\":question, \"answer\":answer})\n",
    "    question_id += 1\n",
    "print(questions[-10:])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 66,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[{'qid': 'easy-flight-0080', 'question': 'How long was the flight NK115 from MCO to BDL on 2022-07-08?', 'answer': '142'}, {'qid': 'easy-flight-0081', 'question': 'How long was the flight AS809 from PDX to OGG on 2022-02-11?', 'answer': '332'}, {'qid': 'easy-flight-0082', 'question': 'How long was the flight UA5045 from DVL to JMS on 2022-03-10?', 'answer': '31'}, {'qid': 'easy-flight-0083', 'question': 'How long was the flight DL441 from DTW to ATL on 2022-05-09?', 'answer': '81'}, {'qid': 'easy-flight-0084', 'question': 'How long was the flight UA662 from EWR to DFW on 2022-04-22?', 'answer': '179'}, {'qid': 'easy-flight-0085', 'question': 'How long was the flight AA793 from BHM to DFW on 2022-05-01?', 'answer': '96'}, {'qid': 'easy-flight-0086', 'question': 'How long was the flight WN327 from TPA to BNA on 2022-03-01?', 'answer': '92'}, {'qid': 'easy-flight-0087', 'question': 'How long was the flight AA3016 from PSP to PHX on 2022-01-23?', 'answer': '48'}, {'qid': 'easy-flight-0088', 'question': 'How long was the flight UA5286 from DEN to ICT on 2022-03-11?', 'answer': '58'}, {'qid': 'easy-flight-0089', 'question': 'How long was the flight AA6030 from PHL to BGR on 2022-05-08?', 'answer': '77'}]\n"
     ]
    }
   ],
   "source": [
    "for i in range(num_questions_per_template):\n",
    "    answer = \"0\"\n",
    "    while answer == \"0\":\n",
    "        random_indice = random.randint(0, len(df)-1)\n",
    "        carrier = df.iloc[random_indice][\"IATA_Code_Marketing_Airline\"]\n",
    "        number = df.iloc[random_indice][\"Flight_Number_Marketing_Airline\"]\n",
    "        Origin = df.iloc[random_indice][\"Origin\"]\n",
    "        Dest = df.iloc[random_indice][\"Dest\"]\n",
    "        FlightDate = df.iloc[random_indice][\"FlightDate\"]\n",
    "        question = \"How long was the flight {}{} from {} to {} on {}?\".format(carrier, number, Origin, Dest, FlightDate)\n",
    "        if df.iloc[random_indice][\"AirTime\"] == '' or pd.isna(df.iloc[random_indice][\"AirTime\"]):\n",
    "            AirTime = \"0\"\n",
    "            answer = AirTime\n",
    "        else:\n",
    "            AirTime = str(int(float(df.iloc[random_indice][\"AirTime\"])))\n",
    "            answer = AirTime\n",
    "    questions.append({\"qid\": \"easy-flight-{:0>4d}\".format(question_id), \"question\":question, \"answer\":answer})\n",
    "    question_id += 1\n",
    "print(questions[-10:])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 67,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[{'qid': 'easy-flight-0090', 'question': 'How many minutes did the DL3538 flight take to taxi in on 2022-06-26?', 'answer': '3'}, {'qid': 'easy-flight-0091', 'question': 'How many minutes did the AS617 flight take to taxi in on 2022-02-26?', 'answer': '11'}, {'qid': 'easy-flight-0092', 'question': 'How many minutes did the WN1069 flight take to taxi in on 2022-02-02?', 'answer': '6'}, {'qid': 'easy-flight-0093', 'question': 'How many minutes did the UA450 flight take to taxi in on 2022-07-31?', 'answer': '4'}, {'qid': 'easy-flight-0094', 'question': 'How many minutes did the AA802 flight take to taxi in on 2022-03-25?', 'answer': '12'}, {'qid': 'easy-flight-0095', 'question': 'How many minutes did the WN1537 flight take to taxi in on 2022-03-13?', 'answer': '5'}, {'qid': 'easy-flight-0096', 'question': 'How many minutes did the AA4804 flight take to taxi in on 2022-02-20?', 'answer': '8'}, {'qid': 'easy-flight-0097', 'question': 'How many minutes did the B6623 flight take to taxi in on 2022-05-05?', 'answer': '32'}, {'qid': 'easy-flight-0098', 'question': 'How many minutes did the DL4967 flight take to taxi in on 2022-04-13?', 'answer': '9'}, {'qid': 'easy-flight-0099', 'question': 'How many minutes did the WN2517 flight take to taxi in on 2022-02-08?', 'answer': '6'}]\n"
     ]
    }
   ],
   "source": [
    "for i in range(num_questions_per_template):\n",
    "    answer = \"0\"\n",
    "    while answer == \"0\":\n",
    "        random_indice = random.randint(0, len(df)-1)\n",
    "        carrier = df.iloc[random_indice][\"IATA_Code_Marketing_Airline\"]\n",
    "        number = df.iloc[random_indice][\"Flight_Number_Marketing_Airline\"]\n",
    "        Origin = df.iloc[random_indice][\"Origin\"]\n",
    "        Dest = df.iloc[random_indice][\"Dest\"]\n",
    "        FlightDate = df.iloc[random_indice][\"FlightDate\"]\n",
    "        question = \"How many minutes did the {}{} flight take to taxi in on {}?\".format(carrier, number, FlightDate)\n",
    "        if df.iloc[random_indice][\"TaxiIn\"] == '':\n",
    "            TaxiIN = \"0\"\n",
    "            answer = TaxiIN\n",
    "        else:\n",
    "            TaxiIN = str(int(float(df.iloc[random_indice][\"TaxiIn\"])))\n",
    "            answer = TaxiIN\n",
    "    questions.append({\"qid\": \"easy-flight-{:0>4d}\".format(question_id), \"question\":question, \"answer\":answer})\n",
    "    question_id += 1\n",
    "print(questions[-10:])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [],
   "source": [
    "with jsonlines.open('/<YOUR_OWN_PATH>/ToolQA/data/questions/easy/flights-easy.jsonl', mode='w') as writer:\n",
    "    for row in questions:\n",
    "        writer.write(row)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "llm",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.16"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
